# Read in data from DQYDJ website and CPS original data. 
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
library(wCorr)
library(tidyr)
library(dplyr)
library(janitor)
library(readr)
library(ggplot2)
library(DescTools)


# CHANGE: IPUMS-CPS data name
data_name <- "[CPS NAME].xml"

# Read in raw data
path_data = "../data/"
raw_2021 <- read_ipums_ddi(paste0(path_data, data_name)) %>% read_ipums_micro

# Filter data for couples as per a custom methodology.
df <- raw_2021 %>% 
  clean_names %>% 
  filter(sploc != 0) %>% 
  # The criteria from DQYDJ.
  filter(marst == 1, age %in% 25:65, wkswork1 >= 20, classwkr %in% 20:28) %>%
  group_by(serial, sploc, pernum) %>% 
  mutate(pair = paste0(serial, "_", min(pernum, sploc), "-", max(pernum, sploc))) %>% 
  ungroup() %>%
  group_by(pair) %>% 
  mutate(len = length(pair)) %>% 
  filter(len > 1) %>% 
  # Construct empirical data from filtered data. 
  rename(wt = asecwt) %>% 
  arrange(incwage) %>%
  ungroup() %>% 
  mutate(pdf = wt / sum(wt), 
         cdf = cumsum(pdf),
         percentile = ceiling(100 * cdf)) 

# Adjust so no percentile is over 100
df$percentile[df$percentile > 100] = 100

# Compute statistics within percentiles
df <- df %>%  
  select(wt, pair, incwage, sex, pdf, cdf, percentile) %>% 
  group_by(percentile) %>% 
  mutate(earnings_threshold_percentile = min(incwage), 
         pdf_percentile = pdf / sum(pdf), 
         mean_incwage_percentile = sum(pdf_percentile * incwage)) %>% 
  ungroup()

# Write CSV
write_csv(df, paste0(path_data, "sample.csv"))
